suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(gplots)
library(here)
library(hyperSpec)
library(RColorBrewer)
library(tidyverse)
library(VennDiagram)
})
suppressMessages({
source(here("UPSCb-common/Rtoolbox/src/plotEnrichedTreemap.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))
source(here("UPSCb-common/src/R/volcanoPlot.R"))
source(here("UPSCb-common/src/R/gopher.R"))
})
pal=brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
"line_plot" <- function(dds=dds,vst=vst,gene_id=gene_id){
message(paste("Plotting",gene_id))
sel <- grepl(gene_id,rownames(vst))
stopifnot(sum(sel)==1)
p <- ggplot(bind_cols(as.data.frame(colData(dds)),
data.frame(value=vst[sel,])),
aes(x=Level,y=value,col=Level,group=Level)) +
geom_point() + geom_smooth() +
scale_y_continuous(name="VST expression") +
ggtitle(label=paste("Expression for: ",gene_id))
suppressMessages(suppressWarnings(plot(p)))
return(NULL)
}
"extract_results" <- function(dds,vst,contrast,
padj=0.01,lfc=0.5,
plot=TRUE,verbose=TRUE,
export=TRUE,default_dir=here("data/analysis/DE"),
default_prefix="DE-",
labels=colnames(dds),
sample_sel=1:ncol(dds),
expression_cutoff=0,
debug=FALSE,filter=c("median",NULL),...){
# get the filter
if(!is.null(match.arg(filter))){
filter <- rowMedians(counts(dds,normalized=TRUE))
message("Using the median normalized counts as default, set filter=NULL to revert to using the mean")
}
# validation
if(length(contrast)==1){
res <- results(dds,name=contrast,filter = filter)
} else {
res <- results(dds,contrast=contrast,filter = filter)
}
stopifnot(length(sample_sel)==ncol(vst))
if(plot){
par(mar=c(5,5,5,5))
volcanoPlot(res)
par(mar=mar)
}
# a look at independent filtering
if(plot){
plot(metadata(res)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter")
lines(metadata(res)$lo.fit, col="red")
abline(v=metadata(res)$filterTheta)
}
if(verbose){
message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
round(metadata(res)$filterThreshold,digits=5),
names(metadata(res)$filterThreshold)))
max.theta <- metadata(res)$filterNumRej[which.max(metadata(res)$filterNumRej$numRej),"theta"]
message(sprintf("The independent filtering maximises for %s %% of the data, corresponding to a base mean expression of %s (library-size normalised read)",
round(max.theta*100,digits=5),
round(quantile(counts(dds,normalized=TRUE),probs=max.theta),digits=5)))
}
if(plot){
qtl.exp=quantile(counts(dds,normalized=TRUE),probs=metadata(res)$filterNumRej$theta)
dat <- data.frame(thetas=metadata(res)$filterNumRej$theta,
qtl.exp=qtl.exp,
number.degs=sapply(lapply(qtl.exp,function(qe){
res$padj <= padj & abs(res$log2FoldChange) >= lfc &
! is.na(res$padj) & res$baseMean >= qe
}),sum))
if(debug){
plot(ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red"))
p <- ggplot(dat,aes(x=thetas,y=qtl.exp)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_log10("base mean expression") +
geom_hline(yintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
plot(ggplot(dat,aes(x=thetas,y=number.degs)) +
geom_line() + geom_point() +
geom_hline(yintercept=dat$number.degs[1],linetype="dashed") +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE genes"))
plot(ggplot(dat,aes(x=thetas,y=number.degs[1] - number.degs),aes()) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Cumulative number of DE genes"))
plot(ggplot(data.frame(x=dat$thetas[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("quantiles of expression") +
scale_y_continuous("Number of DE genes per interval"))
plot(ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_continuous("base mean of expression") +
scale_y_continuous("Number of DE genes per interval"))
p <- ggplot(data.frame(x=dat$qtl.exp[-1],
y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) +
geom_line() + geom_point() +
scale_x_log10("base mean of expression") +
scale_y_continuous("Number of DE genes per interval") +
geom_vline(xintercept=expression_cutoff,
linetype="dotted",col="red")
suppressMessages(suppressWarnings(plot(p)))
}
}
sel <- res$padj <= padj & abs(res$log2FoldChange) >= lfc & ! is.na(res$padj) &
res$baseMean >= expression_cutoff
if(verbose){
message(sprintf(paste(
ifelse(sum(sel)==1,
"There is %s gene that is DE",
"There are %s genes that are DE"),
"with the following parameters: FDR <= %s, |log2FC| >= %s, base mean expression > %s"),
sum(sel),padj,
lfc,expression_cutoff))
}
# proceed only if there are DE genes
if(sum(sel) > 0){
val <- rowSums(vst[sel,sample_sel,drop=FALSE])==0
if (sum(val) >0){
warning(sprintf(paste(
ifelse(sum(val)==1,
"There is %s DE gene that has",
"There are %s DE genes that have"),
"no vst expression in the selected samples"),sum(val)))
sel[sel][val] <- FALSE
}
if(export){
if(!dir.exists(default_dir)){
dir.create(default_dir,showWarnings=FALSE,recursive=TRUE,mode="0771")
}
write.csv(res,file=file.path(default_dir,paste0(default_prefix,"-results.csv")))
write.csv(res[sel,],file.path(default_dir,paste0(default_prefix,"-genes.csv")))
}
if(plot & sum(sel)>1){
heatmap.2(t(scale(t(vst[sel,sample_sel]))),
distfun = pearson.dist,
hclustfun = function(X){hclust(X,method="ward.D2")},
trace="none",col=hpal,labRow = FALSE,
labCol=labels[sample_sel],...
)
}
}
return(list(all=rownames(res[sel,]),
up=rownames(res[sel & res$log2FoldChange > 0,]),
dn=rownames(res[sel & res$log2FoldChange < 0,])))
}
extractEnrichmentResults <- function(enrichment,task="go",
diff.exp=c("all","up","dn"),
go.namespace=c("BP","CC","MF"),
genes=NULL,export=TRUE,plot=TRUE,
default_dir=here("data/analysis/DE"),
default_prefix="DE",
url="athaliana"){
# process args
diff.exp <- match.arg(diff.exp)
de <- ifelse(diff.exp=="all","none",
ifelse(diff.exp=="dn","down",diff.exp))
# sanity
if( is.null(enrichment[[task]]) | length(enrichment[[task]]) == 0){
message(paste("No enrichment for",task))
} else {
# write out
if(export){
write_tsv(enrichment[[task]],
file=here(default_dir,
paste0(default_prefix,"-genes_GO-enrichment.tsv")))
if(!is.null(genes)){
write_tsv(
enrichedTermToGenes(genes=genes,terms=enrichment[[task]]$id,url=url,mc.cores=16L),
file=here(default_dir,
paste0(default_prefix,"-enriched-term-to-genes.tsv"))
)
}
}
if(plot){
sapply(go.namespace,function(ns){
titles <- c(BP="Biological Process",
CC="Cellular Component",
MF="Molecular Function")
suppressWarnings(tryCatch({plotEnrichedTreemap(enrichment,enrichment=task,
namespace=ns,
de=de,title=titles[ns])},
error = function(e) {
message(paste("Treemap plot failed for",ns,
"because of:",e))
return(NULL)
}))
})
}
}
}
load(here("data/analysis/salmon/dds_merge_expr_genes.rda"))
vsd <- varianceStabilizingTransformation(dds,blind=FALSE)
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
vst <- assay(vsd)
vst <- vst - min(vst)
dir.create(here("data/analysis/DE"),showWarnings=FALSE)
save(vst,file=here("data/analysis/DE/vst-aware-exprGenes.rda"))
write_delim(as.data.frame(vst) %>% rownames_to_column("ID"),
here("data/analysis/DE/vst-aware-exprGenes.tsv"))
dds <- DESeq(dds)
## estimating size factors
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## final dispersion estimates
## Note: levels of factors in the design contain characters other than
## letters, numbers, '_' and '.'. It is recommended (but not required) to use
## only letters, numbers, and delimiters '_' or '.', as these are safe characters
## for column names in R. [This is a message, not a warning or an error]
## fitting model and testing
plotDispEsts(dds)
Check the different contrasts
resultsNames(dds)
## [1] "Intercept" "Level_60._vs_80." "Level_40._vs_80."
## [4] "Level_30._vs_80." "Level_30.7d_vs_80." "Level_Collapse_vs_80."
## [7] "Level_C2d_vs_80." "Level_Rehydrate_vs_80."
Evaluate the contrast C2d vs 80, the 80% water content in the soil is our control.
contrast_60_vs_80 <- extract_results(dds=dds,vst=vst,contrast="Level_60._vs_80.", sample_sel = dds$Level %in% c("80%","60%"), labels = dds$Level, default_prefix="DE-60vs80")
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## Loading required package: LSD
## The independent filtering cutoff is 3.92665, removing 16.96089% of the data
## The independent filtering maximises for 20.59061 % of the data, corresponding to a base mean expression of 5.67702 (library-size normalised read)
## There are 0 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
contrast_40_vs_80 <- extract_results(dds=dds,vst=vst,contrast="Level_40._vs_80.", sample_sel = dds$Level %in% c("80%","40%"),labels = dds$Level, default_prefix="DE-40vs80")
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## The independent filtering cutoff is 1.82587, removing 11.5163% of the data
## The independent filtering maximises for 16.96089 % of the data, corresponding to a base mean expression of 3.42178 (library-size normalised read)
## There are 49 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
contrast_30_vs_80 <- extract_results(dds=dds,vst=vst,contrast="Level_30._vs_80.", sample_sel = dds$Level %in% c("80%","30%"),labels = dds$Level, default_prefix="DE-30vs80")
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## The independent filtering cutoff is 1.82587, removing 11.5163% of the data
## The independent filtering maximises for 11.5163 % of the data, corresponding to a base mean expression of 1.09435 (library-size normalised read)
## There are 1354 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
contrast_307d_vs_80 <- extract_results(dds=dds,vst=vst,contrast="Level_30.7d_vs_80.", sample_sel = dds$Level %in% c("80%","30%7d"),labels = dds$Level, default_prefix="DE-307dvs80")
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## The independent filtering cutoff is 1.13544, removing 9.701434% of the data
## The independent filtering maximises for 11.5163 % of the data, corresponding to a base mean expression of 1.09435 (library-size normalised read)
## There are 590 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
contrast_Collapse_vs_80 <- extract_results(dds=dds,vst=vst,contrast="Level_Collapse_vs_80.", sample_sel = dds$Level %in% c("80%","Collapse"),labels = dds$Level, default_prefix="DE-collapsevs80")
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## The independent filtering cutoff is 0.20464, removing 6.071708% of the data
## The independent filtering maximises for 9.70143 % of the data, corresponding to a base mean expression of 0.89866 (library-size normalised read)
## There are 7375 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
contrast_C2d_vs_80 <- extract_results(dds=dds,vst=vst,contrast="Level_C2d_vs_80.", sample_sel = dds$Level %in% c("80%","C2d"), labels = dds$Level, default_prefix="DE-C2dvs80")
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## The independent filtering cutoff is 0.20464, removing 6.071708% of the data
## The independent filtering maximises for 6.07171 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 10544 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
contrast_C2d_vs_80_l2fc2 <- extract_results(dds=dds,vst=vst,contrast="Level_C2d_vs_80.", sample_sel = dds$Level %in% c("80%","C2d"), labels = dds$Level, default_prefix="DE-C2dvs80-lfc2", lfc=2)
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## The independent filtering cutoff is 0.20464, removing 6.071708% of the data
## The independent filtering maximises for 6.07171 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 5289 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 2, base mean expression > 0
contrast_Rehydrate_vs_80 <- extract_results(dds=dds,vst=vst,contrast="Level_Rehydrate_vs_80.", sample_sel = dds$Level %in% c("80%","Rehydrate"),labels = dds$Level, default_prefix="DE-Rehydratevs80")
## Using the median normalized counts as default, set filter=NULL to revert to using the mean
## The independent filtering cutoff is 0.20464, removing 6.071708% of the data
## The independent filtering maximises for 9.70143 % of the data, corresponding to a base mean expression of 0.89866 (library-size normalised read)
## There are 1790 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0
colors <- c("red", "blue")
contrasts_name <- c("60_vs_80", "40_vs_80", "30_vs_80", "307d_vs_80",
"Collapse_vs_80", "C2d_vs_80", "C2d_vs_80_l2fc2", "Rehydrate_vs_80")
contrasts <- c(contrast_60_vs_80, contrast_40_vs_80, contrast_30_vs_80, contrast_307d_vs_80,
contrast_Collapse_vs_80, contrast_C2d_vs_80, contrast_C2d_vs_80_l2fc2, contrast_Rehydrate_vs_80)
matrix_contrast <- matrix(lapply(contrasts[which(names(contrasts) %in% c("up", "dn"))], function(x){length(x)}), nrow = 8, ncol = 2, byrow = TRUE)
dimnames(matrix_contrast) <- list(contrasts_name,c("up", "down"))
end_point = 0.5 + nrow(matrix_contrast) + nrow(matrix_contrast) - 1
barplot(t(matrix_contrast), main="Number of DEG in different contrasts", ylab="Number of genes", xlab="", xaxt="n", space=1,
col=colors, las=2, cex.names = 0.6, cex.axis = 0.8)
legend("topleft", colnames(matrix_contrast), cex=0.8, fill=colors)
text(seq(1.5, end_point, by=2), par("usr")[3]-0.25, srt=60, adj=1, xpd=TRUE, labels=paste(rownames(matrix_contrast)), cex=0.7)
background <- rownames(vst)[featureSelect(vst,dds$Level,exp=0.4)]
res.list <- contrast_C2d_vs_80
enr.list <- lapply(res.list, gopher,background=background, alpha=0.05, task="go",url="picab02", endpoint = "enrichment")
dev.null <- extractEnrichmentResults(enr.list$all, diff.exp= "all", url="piceab02")
## Loading required package: treemap
dev.null <- extractEnrichmentResults(enr.list$up, diff.exp= "up", url="piceab02")
dev.null <- extractEnrichmentResults(enr.list$dn, diff.exp= "dn", url="piceab02")
Note: up and down refers to C2d so the upregulated genes are upregulated in C2d and the same is true for the downregulated one
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.6 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] treemap_2.4-3 jsonlite_1.8.4
## [3] LSD_4.1-0 limma_3.52.4
## [5] VennDiagram_1.7.3 futile.logger_1.4.3
## [7] forcats_1.0.0 stringr_1.5.0
## [9] dplyr_1.1.0 purrr_1.0.1
## [11] readr_2.1.3 tidyr_1.3.0
## [13] tibble_3.1.8 tidyverse_1.3.2
## [15] RColorBrewer_1.1-3 hyperSpec_0.100.0
## [17] xml2_1.3.3 ggplot2_3.4.1
## [19] lattice_0.20-45 here_1.0.1
## [21] gplots_3.1.3 DESeq2_1.36.0
## [23] SummarizedExperiment_1.28.0 Biobase_2.58.0
## [25] MatrixGenerics_1.10.0 matrixStats_0.63.0
## [27] GenomicRanges_1.50.1 GenomeInfoDb_1.34.4
## [29] IRanges_2.32.0 S4Vectors_0.36.1
## [31] BiocGenerics_0.44.0 data.table_1.14.6
##
## loaded via a namespace (and not attached):
## [1] readxl_1.4.1 backports_1.4.1 igraph_1.3.5
## [4] lazyeval_0.2.2 splines_4.2.0 BiocParallel_1.32.4
## [7] gridBase_0.4-7 digest_0.6.31 htmltools_0.5.4
## [10] fansi_1.0.4 magrittr_2.0.3 memoise_2.0.1
## [13] googlesheets4_1.0.1 tzdb_0.3.0 Biostrings_2.66.0
## [16] annotate_1.74.0 modelr_0.1.10 vroom_1.6.1
## [19] timechange_0.2.0 jpeg_0.1-10 colorspace_2.1-0
## [22] blob_1.2.3 rvest_1.0.3 haven_2.5.1
## [25] xfun_0.36 crayon_1.5.2 RCurl_1.98-1.10
## [28] genefilter_1.78.0 survival_3.5-0 glue_1.6.2
## [31] gtable_0.3.1 gargle_1.3.0 zlibbioc_1.44.0
## [34] XVector_0.38.0 DelayedArray_0.24.0 scales_1.2.1
## [37] futile.options_1.0.1 DBI_1.1.3 Rcpp_1.0.10
## [40] xtable_1.8-4 bit_4.0.5 httr_1.4.4
## [43] ellipsis_0.3.2 pkgconfig_2.0.3 XML_3.99-0.13
## [46] sass_0.4.5 dbplyr_2.3.0 deldir_1.0-6
## [49] locfit_1.5-9.7 utf8_1.2.3 tidyselect_1.2.0
## [52] rlang_1.0.6 later_1.3.0 AnnotationDbi_1.60.0
## [55] munsell_0.5.0 cellranger_1.1.0 tools_4.2.0
## [58] cachem_1.0.6 cli_3.6.0 generics_0.1.3
## [61] RSQLite_2.2.20 broom_1.0.3 evaluate_0.20
## [64] fastmap_1.1.0 yaml_2.3.7 knitr_1.42
## [67] bit64_4.0.5 fs_1.6.0 caTools_1.18.2
## [70] KEGGREST_1.38.0 mime_0.12 formatR_1.14
## [73] brio_1.1.3 compiler_4.2.0 rstudioapi_0.14
## [76] curl_5.0.0 png_0.1-8 testthat_3.1.6
## [79] reprex_2.0.2 geneplotter_1.74.0 bslib_0.4.2
## [82] stringi_1.7.12 highr_0.10 Matrix_1.5-3
## [85] vctrs_0.5.2 pillar_1.8.1 lifecycle_1.0.3
## [88] jquerylib_0.1.4 bitops_1.0-7 httpuv_1.6.8
## [91] R6_2.5.1 latticeExtra_0.6-30 promises_1.2.0.1
## [94] KernSmooth_2.23-20 codetools_0.2-18 lambda.r_1.2.4
## [97] gtools_3.9.4 assertthat_0.2.1 rprojroot_2.0.3
## [100] withr_2.5.0 GenomeInfoDbData_1.2.9 parallel_4.2.0
## [103] hms_1.1.2 rmarkdown_2.20 googledrive_2.0.0
## [106] shiny_1.7.4 lubridate_1.9.1 interp_1.1-3